1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.solr.internal.csv;
18
19 import java.io.IOException;
20 import java.io.Reader;
21 import java.io.InputStreamReader;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24
25
26 /**
27 * Parses CSV files according to the specified configuration.
28 *
29 * Because CSV appears in many different dialects, the parser supports many
30 * configuration settings by allowing the specification of a {@link CSVStrategy}.
31 *
32 * <p>Parsing of a csv-string having tabs as separators,
33 * '"' as an optional value encapsulator, and comments starting with '#':</p>
34 * <pre>
35 * String[][] data =
36 * (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
37 * </pre>
38 *
39 * <p>Parsing of a csv-string in Excel CSV format</p>
40 * <pre>
41 * String[][] data =
42 * (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
43 * </pre>
44 *
45 * <p>
46 * Internal parser state is completely covered by the strategy
47 * and the reader-state.</p>
48 *
49 * <p>see <a href="package-summary.html">package documentation</a>
50 * for more details</p>
51 */
52 public class CSVParser {
53
54 /** length of the initial token (content-)buffer */
55 private static final int INITIAL_TOKEN_LENGTH = 50;
56
57 // the token types
58 /** Token has no valid content, i.e. is in its initialized state. */
59 protected static final int TT_INVALID = -1;
60 /** Token with content, at beginning or in the middle of a line. */
61 protected static final int TT_TOKEN = 0;
62 /** Token (which can have content) when end of file is reached. */
63 protected static final int TT_EOF = 1;
64 /** Token with content when end of a line is reached. */
65 protected static final int TT_EORECORD = 2;
66
67 /** Immutable empty String array. */
68 private static final String[] EMPTY_STRING_ARRAY = new String[0];
69
70 // the input stream
71 private final ExtendedBufferedReader in;
72
73 private final CSVStrategy strategy;
74
75 // the following objects are shared to reduce garbage
76 /** A record buffer for getLine(). Grows as necessary and is reused. */
77 private final ArrayList record = new ArrayList();
78 private final Token reusableToken = new Token();
79 private final CharBuffer wsBuf = new CharBuffer();
80 private final CharBuffer code = new CharBuffer(4);
81
82
83 /**
84 * Token is an internal token representation.
85 *
86 * It is used as contract between the lexer and the parser.
87 */
88 static class Token {
89 /** Token type, see TT_xxx constants. */
90 int type = TT_INVALID;
91 /** The content buffer. */
92 CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
93 /** Token ready flag: indicates a valid token with content (ready for the parser). */
94 boolean isReady;
95
96 Token reset() {
97 content.clear();
98 type = TT_INVALID;
99 isReady = false;
100 return this;
101 }
102 }
103
104 // ======================================================
105 // the constructor
106 // ======================================================
107
/**
 * Creates a parser that separates values with ',', encapsulates them
 * with '"' and has comments disabled.
 *
 * @param input a Reader containing "csv-formatted" input
 */
public CSVParser(Reader input) {
  // note: these settings must match the default CSV strategy !!
  this(input, ',', '"', CSVStrategy.COMMENTS_DISABLED);
}
117
/**
 * Creates a parser with a custom value delimiter.
 *
 * Apart from the delimiter the parser uses '"' as encapsulator and has
 * comments disabled.
 *
 * @param input a Reader based on "csv-formatted" input
 * @param delimiter a Char used for value separation
 * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
 */
public CSVParser(Reader input, char delimiter) {
  this(input, new CSVStrategy(delimiter, '"', CSVStrategy.COMMENTS_DISABLED));
}
131
/**
 * Creates a parser from individual CSV dialect settings.
 *
 * The three characters are wrapped into a new {@link CSVStrategy}; all
 * other settings are whatever that three-argument strategy constructor
 * chooses.
 *
 * @param input a Reader based on "csv-formatted" input
 * @param delimiter a Char used for value separation
 * @param encapsulator a Char used as value encapsulation marker
 * @param commentStart a Char used for comment identification
 * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
 */
public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
  this(
      input,
      new CSVStrategy(delimiter, encapsulator, commentStart));
}
148
/**
 * Creates a parser that reads from the given input and follows the given
 * {@link CSVStrategy}.
 *
 * The input is wrapped into an {@link ExtendedBufferedReader}; the strategy
 * is stored as passed (it is not copied).
 *
 * @param input a Reader containing "csv-formatted" input
 * @param strategy the CSVStrategy used for CSV parsing
 */
public CSVParser(Reader input, CSVStrategy strategy) {
  this.strategy = strategy;
  this.in = new ExtendedBufferedReader(input);
}
159
160 // ======================================================
161 // the parser
162 // ======================================================
163
164 /**
165 * Parses the CSV according to the given strategy
166 * and returns the content as an array of records
167 * (whereas records are arrays of single values).
168 * <p>
169 * The returned content starts at the current parse-position in
170 * the stream.
171 *
172 * @return matrix of records x values ('null' when end of file)
173 * @throws IOException on parse error or input read-failure
174 */
175 public String[][] getAllValues() throws IOException {
176 ArrayList records = new ArrayList();
177 String[] values;
178 String[][] ret = null;
179 while ((values = getLine()) != null) {
180 records.add(values);
181 }
182 if (records.size() > 0) {
183 ret = new String[records.size()][];
184 records.toArray(ret);
185 }
186 return ret;
187 }
188
189 /**
190 * Parses the CSV according to the given strategy
191 * and returns the next csv-value as string.
192 *
193 * @return next value in the input stream ('null' when end of file)
194 * @throws IOException on parse error or input read-failure
195 */
196 public String nextValue() throws IOException {
197 Token tkn = nextToken();
198 String ret = null;
199 switch (tkn.type) {
200 case TT_TOKEN:
201 case TT_EORECORD:
202 ret = tkn.content.toString();
203 break;
204 case TT_EOF:
205 ret = null;
206 break;
207 case TT_INVALID:
208 default:
209 // error no token available (or error)
210 throw new IOException(
211 "(line " + getLineNumber()
212 + ") invalid parse sequence");
213 // unreachable: break;
214 }
215 return ret;
216 }
217
218 /**
219 * Parses from the current point in the stream til
220 * the end of the current line.
221 *
222 * @return array of values til end of line
223 * ('null' when end of file has been reached)
224 * @throws IOException on parse error or input read-failure
225 */
226 public String[] getLine() throws IOException {
227 String[] ret = EMPTY_STRING_ARRAY;
228 record.clear();
229 while (true) {
230 reusableToken.reset();
231 nextToken(reusableToken);
232 switch (reusableToken.type) {
233 case TT_TOKEN:
234 record.add(reusableToken.content.toString());
235 break;
236 case TT_EORECORD:
237 record.add(reusableToken.content.toString());
238 break;
239 case TT_EOF:
240 if (reusableToken.isReady) {
241 record.add(reusableToken.content.toString());
242 } else {
243 ret = null;
244 }
245 break;
246 case TT_INVALID:
247 default:
248 // error: throw IOException
249 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
250 // unreachable: break;
251 }
252 if (reusableToken.type != TT_TOKEN) {
253 break;
254 }
255 }
256 if (!record.isEmpty()) {
257 ret = (String[]) record.toArray(new String[record.size()]);
258 }
259 return ret;
260 }
261
262 /**
263 * Returns the current line number in the input stream.
264 *
265 * ATTENTION: in case your csv has multiline-values the returned
266 * number does not correspond to the record-number
267 *
268 * @return current line number
269 */
270 public int getLineNumber() {
271 return in.getLineNumber();
272 }
273
274 // ======================================================
275 // the lexer(s)
276 // ======================================================
277
278 /**
279 * Convenience method for <code>nextToken(null)</code>.
280 */
281 protected Token nextToken() throws IOException {
282 return nextToken(new Token());
283 }
284
285 /**
286 * Returns the next token.
287 *
288 * A token corresponds to a term, a record change or an
289 * end-of-file indicator.
290 *
291 * @param tkn an existing Token object to reuse. The caller is responsible to initialize the
292 * Token.
293 * @return the next token found
294 * @throws IOException on stream access error
295 */
296 protected Token nextToken(Token tkn) throws IOException {
297 wsBuf.clear(); // resuse
298
299 // get the last read char (required for empty line detection)
300 int lastChar = in.readAgain();
301
302 // read the next char and set eol
303 /* note: unfourtunately isEndOfLine may consumes a character silently.
304 * this has no effect outside of the method. so a simple workaround
305 * is to call 'readAgain' on the stream...
306 * uh: might using objects instead of base-types (jdk1.5 autoboxing!)
307 */
308 int c = in.read();
309 boolean eol = isEndOfLine(c);
310 c = in.readAgain();
311
312 // empty line detection: eol AND (last char was EOL or beginning)
313 while (strategy.getIgnoreEmptyLines() && eol
314 && (lastChar == '\n'
315 || lastChar == ExtendedBufferedReader.UNDEFINED)
316 && !isEndOfFile(lastChar)) {
317 // go on char ahead ...
318 lastChar = c;
319 c = in.read();
320 eol = isEndOfLine(c);
321 c = in.readAgain();
322 // reached end of file without any content (empty line at the end)
323 if (isEndOfFile(c)) {
324 tkn.type = TT_EOF;
325 return tkn;
326 }
327 }
328
329 // did we reached eof during the last iteration already ? TT_EOF
330 if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
331 tkn.type = TT_EOF;
332 return tkn;
333 }
334
335 // important: make sure a new char gets consumed in each iteration
336 while (!tkn.isReady && tkn.type != TT_EOF) {
337 // ignore whitespaces at beginning of a token
338 while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) {
339 wsBuf.append((char) c);
340 c = in.read();
341 eol = isEndOfLine(c);
342 }
343 // ok, start of token reached: comment, encapsulated, or token
344 if (c == strategy.getCommentStart()) {
345 // ignore everything till end of line and continue (incr linecount)
346 in.readLine();
347 tkn = nextToken(tkn.reset());
348 } else if (c == strategy.getDelimiter()) {
349 // empty token return TT_TOKEN("")
350 tkn.type = TT_TOKEN;
351 tkn.isReady = true;
352 } else if (eol) {
353 // empty token return TT_EORECORD("")
354 //noop: tkn.content.append("");
355 tkn.type = TT_EORECORD;
356 tkn.isReady = true;
357 } else if (c == strategy.getEncapsulator()) {
358 // consume encapsulated token
359 encapsulatedTokenLexer(tkn, c);
360 } else if (isEndOfFile(c)) {
361 // end of file return TT_EOF()
362 //noop: tkn.content.append("");
363 tkn.type = TT_EOF;
364 tkn.isReady = true;
365 } else {
366 // next token must be a simple token
367 // add removed blanks when not ignoring whitespace chars...
368 if (!strategy.getIgnoreLeadingWhitespaces()) {
369 tkn.content.append(wsBuf);
370 }
371 simpleTokenLexer(tkn, c);
372 }
373 }
374 return tkn;
375 }
376
377 /**
378 * A simple token lexer
379 *
380 * Simple token are tokens which are not surrounded by encapsulators.
381 * A simple token might contain escaped delimiters (as \, or \;). The
382 * token is finished when one of the following conditions become true:
383 * <ul>
384 * <li>end of line has been reached (TT_EORECORD)</li>
385 * <li>end of stream has been reached (TT_EOF)</li>
386 * <li>an unescaped delimiter has been reached (TT_TOKEN)</li>
387 * </ul>
388 *
389 * @param tkn the current token
390 * @param c the current character
391 * @return the filled token
392 *
393 * @throws IOException on stream access error
394 */
395 private Token simpleTokenLexer(Token tkn, int c) throws IOException {
396 for (;;) {
397 if (isEndOfLine(c)) {
398 // end of record
399 tkn.type = TT_EORECORD;
400 tkn.isReady = true;
401 break;
402 } else if (isEndOfFile(c)) {
403 // end of file
404 tkn.type = TT_EOF;
405 tkn.isReady = true;
406 break;
407 } else if (c == strategy.getDelimiter()) {
408 // end of token
409 tkn.type = TT_TOKEN;
410 tkn.isReady = true;
411 break;
412 } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
413 // interpret unicode escaped chars (like \u0070 -> p)
414 tkn.content.append((char) unicodeEscapeLexer(c));
415 } else if (c == strategy.getEscape()) {
416 tkn.content.append((char)readEscape(c));
417 } else {
418 tkn.content.append((char) c);
419 }
420
421 c = in.read();
422 }
423
424 if (strategy.getIgnoreTrailingWhitespaces()) {
425 tkn.content.trimTrailingWhitespace();
426 }
427
428 return tkn;
429 }
430
431
432 /**
433 * An encapsulated token lexer
434 *
435 * Encapsulated tokens are surrounded by the given encapsulating-string.
436 * The encapsulator itself might be included in the token using a
437 * doubling syntax (as "", '') or using escaping (as in \", \').
438 * Whitespaces before and after an encapsulated token are ignored.
439 *
440 * @param tkn the current token
441 * @param c the current character
442 * @return a valid token object
443 * @throws IOException on invalid state
444 */
445 private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
446 // save current line
447 int startLineNumber = getLineNumber();
448 // ignore the given delimiter
449 // assert c == delimiter;
450 for (;;) {
451 c = in.read();
452
453 if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
454 tkn.content.append((char) unicodeEscapeLexer(c));
455 } else if (c == strategy.getEscape()) {
456 tkn.content.append((char)readEscape(c));
457 } else if (c == strategy.getEncapsulator()) {
458 if (in.lookAhead() == strategy.getEncapsulator()) {
459 // double or escaped encapsulator -> add single encapsulator to token
460 c = in.read();
461 tkn.content.append((char) c);
462 } else {
463 // token finish mark (encapsulator) reached: ignore whitespace till delimiter
464 for (;;) {
465 c = in.read();
466 if (c == strategy.getDelimiter()) {
467 tkn.type = TT_TOKEN;
468 tkn.isReady = true;
469 return tkn;
470 } else if (isEndOfFile(c)) {
471 tkn.type = TT_EOF;
472 tkn.isReady = true;
473 return tkn;
474 } else if (isEndOfLine(c)) {
475 // ok eo token reached
476 tkn.type = TT_EORECORD;
477 tkn.isReady = true;
478 return tkn;
479 } else if (!isWhitespace(c)) {
480 // error invalid char between token and next delimiter
481 throw new IOException(
482 "(line " + getLineNumber()
483 + ") invalid char between encapsulated token end delimiter"
484 );
485 }
486 }
487 }
488 } else if (isEndOfFile(c)) {
489 // error condition (end of file before end of token)
490 throw new IOException(
491 "(startline " + startLineNumber + ")"
492 + "eof reached before encapsulated token finished"
493 );
494 } else {
495 // consume character
496 tkn.content.append((char) c);
497 }
498 }
499 }
500
501
502 /**
503 * Decodes Unicode escapes.
504 *
505 * Interpretation of "\\uXXXX" escape sequences
506 * where XXXX is a hex-number.
507 * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
508 * @return the decoded character
509 * @throws IOException on wrong unicode escape sequence or read error
510 */
511 protected int unicodeEscapeLexer(int c) throws IOException {
512 int ret = 0;
513 // ignore 'u' (assume c==\ now) and read 4 hex digits
514 c = in.read();
515 code.clear();
516 try {
517 for (int i = 0; i < 4; i++) {
518 c = in.read();
519 if (isEndOfFile(c) || isEndOfLine(c)) {
520 throw new NumberFormatException("number too short");
521 }
522 code.append((char) c);
523 }
524 ret = Integer.parseInt(code.toString(), 16);
525 } catch (NumberFormatException e) {
526 throw new IOException(
527 "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
528 + code.toString() + "'" + e.toString());
529 }
530 return ret;
531 }
532
533 private int readEscape(int c) throws IOException {
534 // assume c is the escape char (normally a backslash)
535 c = in.read();
536 int out;
537 switch (c) {
538 case 'r': out='\r'; break;
539 case 'n': out='\n'; break;
540 case 't': out='\t'; break;
541 case 'b': out='\b'; break;
542 case 'f': out='\f'; break;
543 default : out=c;
544 }
545 return out;
546 }
547
548 // ======================================================
549 // strategies
550 // ======================================================
551
552 /**
553 * Obtain the specified CSV Strategy. This should not be modified.
554 *
555 * @return strategy currently being used
556 */
557 public CSVStrategy getStrategy() {
558 return this.strategy;
559 }
560
561 // ======================================================
562 // Character class checker
563 // ======================================================
564
565 /**
566 * @return true if the given char is a whitespace character
567 */
568 private boolean isWhitespace(int c) {
569 return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
570 }
571
572 /**
573 * Greedy - accepts \n and \r\n
574 * This checker consumes silently the second control-character...
575 *
576 * @return true if the given character is a line-terminator
577 */
578 private boolean isEndOfLine(int c) throws IOException {
579 // check if we have \r\n...
580 if (c == '\r') {
581 if (in.lookAhead() == '\n') {
582 // note: does not change c outside of this method !!
583 c = in.read();
584 }
585 }
586 return (c == '\n');
587 }
588
589 /**
590 * @return true if the given character indicates end of file
591 */
592 private boolean isEndOfFile(int c) {
593 return c == ExtendedBufferedReader.END_OF_STREAM;
594 }
595 }